#include<iostream>
#include<vector>
#include<string>
#include <boost/filesystem.hpp>
#include "tool.hpp"
#include "log.hpp"


using namespace std;


const string src_path = "data/input";
const string raw = "data/raw_html/raw.txt";

typedef struct format
{
    string title;//标题
    string content;//内容
    string url;//url
}Format;

bool Readfile(const string &src_path,vector<string> *files_gather)
{
    boost::filesystem::path file_path(src_path);
    if(!boost::filesystem::exists(file_path))//判断stc_path路径是否不存在
    {
        cerr<<"src_path is does not exist"<<endl;
        return false;
    }

    boost::filesystem::recursive_directory_iterator end; //空迭代器，标志结束
    //boost::filesystem::directory_iterator 用于迭代指定目录的直接内容，不会递归遍历子目录

    //boost::filesystem::recursive_directory_iterator 用于递归遍历目录及其子目录的内容
    for(boost::filesystem::recursive_directory_iterator iter(file_path);iter!=end;iter++)//遍历
    {
        if(!boost::filesystem::is_regular_file(*iter))//我们需要后缀.html并且是普通文件
        {
           continue;
        }
        if(iter->path().extension()!=".html")
        {
          continue;
        }
        
        //cout<<*iter<<endl;
        //cout<<iter->path().string()<<endl;//debug
       // count++;
        files_gather->push_back(iter->path().string());

    }
    //cout<<count<<endl;
    return true;
}
static bool partitle(const string &result,string *title)
{
    size_t begin = result.find("<title>");
    if(begin == string::npos)
    {
        return false;
    }

    size_t end = result.find("</title>");
    if(end == string::npos)
    {
        return false;
    }

    begin += string("<title>").size();

    if(begin>end)
    {
        return false;
    }

    *title = result.substr(begin,end-begin);
    return true;
}
static bool parcontent(const string &result,string *content)
{
    enum state
    {
        Label,
        Content
    };

    state a =Label;
    for(char c : result)
    {
        switch (a)
        {
        case Label:
            if(c == '>')
                a =Content;
            break;
        case Content:
            if(c=='<')
                a=Label;
            else
            {
                if(c =='\n') c=' ';
                content->push_back(c);
            }
            break;
        default:
            break;
        }
    }    
    return true;
}
static void ShowDoc(const Format &doc)
{
    cout << "title: " << doc.title << endl;
    cout << "content: " << doc.content << endl;
    cout << "url: " << doc.url << endl;
}
static bool parturl(const string &file,string *url)
{
    //string url_head = "https://www.boost.org/doc/libs/1_86_0/libs/config/doc/html/";
    
    //string url_head = "https://www.boost.org/doc/libs/1_86_0/doc/html/";
    string url_head = "https://www.boost.org/doc/libs/1_78_0/doc/html";
    
    string url_tail = file.substr(src_path.size());
    *url =(url_head+url_tail);
    return true;
}
bool Anafile(vector<string> &files_gather,vector<Format> *outcome)
{
    //int k =2;
    for(string &file : files_gather)
    {
        string result;//读取文件内容
        if(!project_tool::Filetool::divestfile(file,&result))
        {
          continue;
        }
        
        Format temp;

        if(!partitle(result,&temp.title))//读取文档标题
        {
            continue;
        }
        
        if(!parcontent(result,&temp.content))//去标签
        {
            continue;
        }

        if(!parturl(file,&temp.url))
        {
            continue;
        }
        outcome->push_back(move(temp));//性能提升
    }
    return true;
}

bool SaveHtml(vector<Format> &outcome,const string &raw)
{
    const char c = '\3';

    ofstream out(raw, ios::out | ios::binary);
    if(!out.is_open()){
        cerr << "open " << raw << " failed!" << endl;
        return false;
    }

    for(Format &item : outcome){
        string temp_out;

        temp_out = item.title;
        temp_out += c;
        temp_out += item.content;
        temp_out += c;
        temp_out += item.url;
        temp_out += '\n';

        out.write(temp_out.c_str(), temp_out.size());
        if (out.fail()) {
        std::cerr << "Error occurred while writing to the file." << std::endl;
        return 1;
        }   
    }

    out.close();

    return true;
}

int main()
{
    vector<string> files_gather;
    //1.读取html文件的路径保存到files_gather，用于后续分析
    if(!Readfile(src_path,&files_gather))
    {
        cerr<<"Readfile is error"<<endl;
        return 1;
    }

    //2.分析读取后的文件,结果放到outcome
    vector<Format> outcome;
    if(!Anafile(files_gather,&outcome))
    {
        cerr<<"Anafile is error"<<endl;
        return 2;
    }

    //3.解析完的结果放到raw，用\3分隔
    if(!SaveHtml(outcome,raw))
    {
        cerr<<"SaveHtml is error"<<endl;
        return 3;
    }

    return 0;
}
